
Conversation

@rampitec
Collaborator

No description provided.

Collaborator Author

rampitec commented Jul 14, 2025

@rampitec rampitec requested review from changpeng and shiltian July 14, 2025 21:22
@rampitec rampitec marked this pull request as ready for review July 14, 2025 21:23
@llvmbot
Member

llvmbot commented Jul 14, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

Patch is 45.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/148727.diff

6 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+31-2)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+5)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+28-3)
  • (modified) llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp (+5-1)
  • (modified) llvm/test/CodeGen/AMDGPU/code-size-estimate.ll (+314)
  • (added) llvm/test/CodeGen/AMDGPU/literal64.ll (+324)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2540921b75e5d..9adf6f7cb1b8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -447,6 +447,35 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
     return;
   }
 
+  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
+  if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
+      CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
+    uint64_t C = 0;
+    bool AllConst = true;
+    unsigned EltSize = EltVT.getSizeInBits();
+    for (unsigned I = 0; I < NumVectorElts; ++I) {
+      SDValue Op = N->getOperand(I);
+      if (Op.isUndef()) {
+        AllConst = false;
+        break;
+      }
+      uint64_t Val;
+      if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
+        Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
+      } else
+        Val = cast<ConstantSDNode>(Op)->getZExtValue();
+      C |= Val << (EltSize * I);
+    }
+    if (AllConst) {
+      SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
+      MachineSDNode *Copy = CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO,
+                                                   DL, VT, CV);
+      CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
+                           RegClass);
+      return;
+    }
+  }
+
   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                   "supported yet");
   // 32 = Max Num Vector Elements
@@ -454,7 +483,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   // 1 = Vector Register Class
   SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
 
-  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
   RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
   bool IsRegSeq = true;
   unsigned NOps = N->getNumOperands();
@@ -676,7 +704,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
 
   case ISD::Constant:
   case ISD::ConstantFP: {
-    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
+        Subtarget->has64BitLiterals())
       break;
 
     uint64_t Imm;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e90316cee12fe..21bd017540b09 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12155,6 +12155,11 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
   if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
        bitOpWithConstantIsReducible(Opc, ValHi)) ||
       (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
+    // We have 64-bit scalar and/or/xor, but do not have vector forms.
+    if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
+        !CRHS->user_begin()->isDivergent())
+      return SDValue();
+
     // If we need to materialize a 64-bit immediate, it will be split up later
     // anyway. Avoid creating the harder to understand 64-bit immediate
     // materialization.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4c5f938831243..20a8da4a317db 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2273,6 +2273,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
     const MachineOperand &SrcOp = MI.getOperand(1);
     assert(!SrcOp.isFPImm());
+
+    if (ST.has64BitLiterals()) {
+      MI.setDesc(get(AMDGPU::S_MOV_B64));
+      break;
+    }
+
     APInt Imm(64, SrcOp.getImm());
     if (Imm.isIntN(32) || isInlineConstant(Imm)) {
       MI.setDesc(get(AMDGPU::S_MOV_B64));
@@ -6099,14 +6105,18 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                      OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
     if (Is64BitOp &&
         !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
-      if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
+      if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
+          (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
         return false;
 
       // FIXME: We can use sign extended 64-bit literals, but only for signed
       //        operands. At the moment we do not know if an operand is signed.
       //        Such operand will be encoded as its low 32 bits and then either
       //        correctly sign extended or incorrectly zero extended by HW.
-      if (!Is64BitFPOp && (int32_t)Imm < 0)
+      //        If 64-bit literals are supported and the literal will be encoded
+      //        as full 64 bit we still can use it.
+      if (!Is64BitFPOp && (int32_t)Imm < 0 &&
+          (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
         return false;
     }
   }
@@ -9178,15 +9188,30 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     if (isDPP(MI))
       return DescSize;
     bool HasLiteral = false;
+    unsigned LiteralSize = 4;
     for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
       const MachineOperand &Op = MI.getOperand(I);
       const MCOperandInfo &OpInfo = Desc.operands()[I];
       if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
         HasLiteral = true;
+        if (ST.has64BitLiterals()) {
+          switch(OpInfo.OperandType) {
+          default:
+            break;
+          case AMDGPU::OPERAND_REG_IMM_FP64:
+            if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
+              LiteralSize = 8;
+            break;
+          case AMDGPU::OPERAND_REG_IMM_INT64:
+            if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
+              LiteralSize = 8;
+            break;
+          }
+        }
         break;
       }
     }
-    return HasLiteral ? DescSize + 4 : DescSize;
+    return HasLiteral ? DescSize + LiteralSize : DescSize;
   }
 
   // Check whether we have extra NSA words.
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index fd39b8a1350c6..4a4b865dc5d1d 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -1058,7 +1058,11 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
       // fold an immediate into the shrunk instruction as a literal operand. In
       // GFX10 VOP3 instructions can take a literal operand anyway, so there is
       // no advantage to doing this.
-      if (ST->hasVOP3Literal() && !IsPostRA)
+      // However, if 64-bit literals are allowed we still need to shrink it
+      // for such literal to be able to fold.
+      if (ST->hasVOP3Literal() &&
+          (!ST->has64BitLiterals() || AMDGPU::isTrue16Inst(MI.getOpcode())) &&
+          !IsPostRA)
         continue;
 
       if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index ac03d2dae8fa8..dea9142cf2bee 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -1,8 +1,10 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,NOT-GFX12 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10,NOT-GFX12 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX1100,NOT-GFX12 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX1150,NOT-GFX12 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX1200 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
 
 declare float @llvm.fabs.f32(float)
 declare float @llvm.fma.f32(float, float, float)
@@ -35,11 +37,19 @@ define float @v_mul_f32_vop2(float %x, float %y) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_mul_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x10]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_mul_f32_vop2:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_mul_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x10]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %mul = fmul float %x, %y
   ret float %mul
 }
 ; NOT-GFX12: codeLenInByte = 12
 ; GFX1200: codeLenInByte = 28
+; GFX1250: codeLenInByte = 16
 
 define float @v_mul_f32_vop2_inline_imm(float %x) {
 ; GFX9-LABEL: v_mul_f32_vop2_inline_imm:
@@ -69,11 +79,19 @@ define float @v_mul_f32_vop2_inline_imm(float %x) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_mul_f32_e32 v0, 4.0, v0 ; encoding: [0xf6,0x00,0x00,0x10]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_mul_f32_vop2_inline_imm:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_mul_f32_e32 v0, 4.0, v0 ; encoding: [0xf6,0x00,0x00,0x10]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %mul = fmul float %x, 4.0
   ret float %mul
 }
 ; NOT-GFX12: codeLenInByte = 12
 ; GFX1200: codeLenInByte = 28
+; GFX1250: codeLenInByte = 16
 
 define float @v_mul_f32_vop2_literal(float %x) {
 ; GFX9-LABEL: v_mul_f32_vop2_literal:
@@ -103,11 +121,19 @@ define float @v_mul_f32_vop2_literal(float %x) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_mul_f32_e32 v0, 0x42f60000, v0 ; encoding: [0xff,0x00,0x00,0x10,0x00,0x00,0xf6,0x42]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_mul_f32_vop2_literal:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_mul_f32_e32 v0, 0x42f60000, v0 ; encoding: [0xff,0x00,0x00,0x10,0x00,0x00,0xf6,0x42]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %mul = fmul float %x, 123.0
   ret float %mul
 }
 ; NOT-GFX12: codeLenInByte = 16
 ; GFX1200: codeLenInByte = 32
+; GFX1250: codeLenInByte = 20
 
 define float @v_mul_f32_vop3_src_mods(float %x, float %y) {
 ; GFX9-LABEL: v_mul_f32_vop3_src_mods:
@@ -137,12 +163,20 @@ define float @v_mul_f32_vop3_src_mods(float %x, float %y) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x00]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_mul_f32_vop3_src_mods:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x00]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %mul = fmul float %fabs.x, %y
   ret float %mul
 }
 ; NOT-GFX12: codeLenInByte = 16
 ; GFX1200: codeLenInByte = 32
+; GFX1250: codeLenInByte = 20
 
 define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
 ; GFX9-LABEL: v_mul_f32_vop3_src_mods_inline_imm:
@@ -172,6 +206,13 @@ define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x00]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_mul_f32_vop3_src_mods_inline_imm:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x00]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %mul = fmul float %fabs.x, 4.0
   ret float %mul
@@ -179,6 +220,7 @@ define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
 
 ; NOT-GFX12: codeLenInByte = 16
 ; GFX1200: codeLenInByte = 32
+; GFX1250: codeLenInByte = 20
 
 define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
 ; GFX9-LABEL: v_mul_f32_vop3_src_mods_literal:
@@ -209,6 +251,13 @@ define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x00,0x00,0x00,0xf6,0x42]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_mul_f32_vop3_src_mods_literal:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x00,0x00,0x00,0xf6,0x42]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %mul = fmul float %fabs.x, 123.0
   ret float %mul
@@ -218,6 +267,7 @@ define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
 ; GFX10: codeLenInByte = 20
 ; GFX11: codeLenInByte = 20
 ; GFX1200: codeLenInByte = 36
+; GFX1250: codeLenInByte = 24
 
 define float @v_mul_f32_vop2_frame_index(float %x) {
 ; GFX9-LABEL: v_mul_f32_vop2_frame_index:
@@ -249,6 +299,13 @@ define float @v_mul_f32_vop2_frame_index(float %x) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_mul_f32_e32 v0, s32, v0 ; encoding: [0x20,0x00,0x00,0x10]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_mul_f32_vop2_frame_index:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_mul_f32_e32 v0, s32, v0 ; encoding: [0x20,0x00,0x00,0x10]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %alloca = alloca i32, addrspace(5)
   %ptrtoint = ptrtoint ptr addrspace(5) %alloca to i32
   %cast = bitcast i32 %ptrtoint to float
@@ -260,6 +317,7 @@ define float @v_mul_f32_vop2_frame_index(float %x) {
 ; GFX10: codeLenInByte = 20
 ; GFX11: codeLenInByte = 12
 ; GFX1200: codeLenInByte = 28
+; GFX1250: codeLenInByte = 16
 
 define float @v_fma_f32(float %x, float %y, float %z) {
 ; GFX9-LABEL: v_fma_f32:
@@ -289,12 +347,20 @@ define float @v_fma_f32(float %x, float %y, float %z) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_fma_f32 v0, v0, v1, v2 ; encoding: [0x00,0x00,0x13,0xd6,0x00,0x03,0x0a,0x04]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_fma_f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_fma_f32 v0, v0, v1, v2 ; encoding: [0x00,0x00,0x13,0xd6,0x00,0x03,0x0a,0x04]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %fma = call float @llvm.fma.f32(float %x, float %y, float %z)
   ret float %fma
 }
 
 ; NOT-GFX12: codeLenInByte = 16
 ; GFX1200: codeLenInByte = 32
+; GFX1250: codeLenInByte = 20
 
 define float @v_fma_f32_src_mods(float %x, float %y, float %z) {
 ; GFX9-LABEL: v_fma_f32_src_mods:
@@ -324,6 +390,13 @@ define float @v_fma_f32_src_mods(float %x, float %y, float %z) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_fma_f32 v0, |v0|, v1, v2 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0x0a,0x04]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_fma_f32_src_mods:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_fma_f32 v0, |v0|, v1, v2 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0x0a,0x04]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z)
   ret float %fma
@@ -331,6 +404,7 @@ define float @v_fma_f32_src_mods(float %x, float %y, float %z) {
 
 ; NOT-GFX12: codeLenInByte = 16
 ; GFX1200: codeLenInByte = 32
+; GFX1250: codeLenInByte = 20
 
 define float @v_fmac_f32(float %x, float %y) {
 ; GFX9-LABEL: v_fmac_f32:
@@ -360,6 +434,13 @@ define float @v_fmac_f32(float %x, float %y) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_fmac_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x56]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_fmac_f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_fmac_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x56]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %fma = call float @llvm.fma.f32(float %x, float %y, float %x)
   ret float %fma
 }
@@ -368,6 +449,7 @@ define float @v_fmac_f32(float %x, float %y) {
 ; GFX10: codeLenInByte = 12
 ; GFX11: codeLenInByte = 12
 ; GFX1200: codeLenInByte = 28
+; GFX1250: codeLenInByte = 16
 
 define float @v_fmaak_f32(float %x, float %y) {
 ; GFX9-LABEL: v_fmaak_f32:
@@ -398,6 +480,13 @@ define float @v_fmaak_f32(float %x, float %y) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_fmaak_f32 v0, v0, v1, 0x43800000 ; encoding: [0x00,0x03,0x00,0x5a,0x00,0x00,0x80,0x43]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_fmaak_f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_fmaak_f32 v0, v0, v1, 0x43800000 ; encoding: [0x00,0x03,0x00,0x5a,0x00,0x00,0x80,0x43]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   %fma = call float @llvm.fma.f32(float %x, float %y, float 256.0)
   ret float %fma
 }
@@ -406,6 +495,7 @@ define float @v_fmaak_f32(float %x, float %y) {
 ; GFX10: codeLenInByte = 16
 ; GFX11: codeLenInByte = 16
 ; GFX1200: codeLenInByte = 32
+; GFX1250: codeLenInByte = 20
 
 define float @v_fma_k_f32_src_mods(float %x, float %y) {
 ; GFX9-LABEL: v_fma_k_f32_src_mods:
@@ -436,6 +526,13 @@ define float @v_fma_k_f32_src_mods(float %x, float %y) {
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
 ; GFX1200-NEXT:    v_fma_f32 v0, |v0|, v1, 0x43800000 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0xfe,0x03,0x00,0x00,0x80,0x43]
 ; GFX1200-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX1250-LABEL: v_fma_k_f32_src_mods:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf]
+; GFX1250-NEXT:    s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
+; GFX1250-NEXT:    v_fma_f32 v0, |v0|, v1, 0x4380000...
[truncated]

@github-actions

github-actions bot commented Jul 14, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@rampitec rampitec force-pushed the users/rampitec/07-14-_amdgpu_use_64-bit_literals_in_codegen_on_gfx1250 branch from ee6b41f to 4fdfcf2 Compare July 14, 2025 21:27
Comment on lines +463 to +466
if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
} else
Val = cast<ConstantSDNode>(Op)->getZExtValue();
Contributor

Suggested change
if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
} else
Val = cast<ConstantSDNode>(Op)->getZExtValue();
if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op))
Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
else
Val = cast<ConstantSDNode>(Op)->getZExtValue();

Collaborator Author

Matt was asking to use braces when a variable is defined inside the if.

Contributor

@shiltian shiltian Jul 14, 2025

Hmm, okay, then use {} for the else as well. Using braces on only half of them is not a good idea.

I'm not sure the LLVM coding standard says we need {} for a variable definition.

Collaborator Author

And again, I've been asked multiple times not to use braces on the else in this situation ;)
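
(Editor's aside, not part of the PR: for readers following the style debate, below is a self-contained sketch of the two brace variants being discussed, using the same dyn_cast pattern as the snippet above. The wrapper function names are hypothetical and exist only to make the snippet compile on its own.)

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Variant kept in the patch: braces on the if because it declares a variable,
// no braces on the single-statement else.
static uint64_t bitsOfConstantA(SDValue Op) {
  uint64_t Val;
  if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) {
    Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
  } else
    Val = cast<ConstantSDNode>(Op)->getZExtValue();
  return Val;
}

// Variant from the review suggestion: braces on neither branch (the symmetric
// alternative would be braces on both).
static uint64_t bitsOfConstantB(SDValue Op) {
  uint64_t Val;
  if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op))
    Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
  else
    Val = cast<ConstantSDNode>(Op)->getZExtValue();
  return Val;
}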

Base automatically changed from users/rampitec/07-14-_amdgpu_add_gfx1250_v_fmac_f64_implementation to main July 14, 2025 22:39
@rampitec rampitec merged commit a32040e into main Jul 14, 2025
9 checks passed
@rampitec rampitec deleted the users/rampitec/07-14-_amdgpu_use_64-bit_literals_in_codegen_on_gfx1250 branch July 14, 2025 22:47